Objective
Data Description:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy
import seaborn as sns
sns.set(font_scale=1)
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
# from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import confusion_matrix,accuracy_score, recall_score, precision_score, f1_score
import os
import json
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
import random
import time # To time the execution time for GridSearchCV and RandomSearchCV
from sklearn.decomposition import PCA
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
import sklearn
import imblearn
# Determine the number of CPUs and use all of them but 1
import multiprocessing
# Get CPU count
c_count = multiprocessing.cpu_count()
c_count = c_count - 1
if c_count <= 0:  # If the machine has only one CPU, use that one (original comment said "GPU")
    c_count = 1
# Print library versions for reproducibility of the notebook run
print(np.__version__)
print(sklearn.__version__)
print(imblearn.__version__)
print(pd.__version__)
print(sns.__version__)
#Load the dataset
df_original = pd.read_csv("BankChurners.csv")
# make a copy of the loaded dataframe into a variable df.
# we will modify df throughout the process
df = df_original.copy()
df.head(2)
df.info()
# True nulls; note this dataset encodes missing categoricals as 'Unknown' instead
df.isnull().sum()
Comments
def ProcessNulls(df, column):
    """Return the share of 'Unknown' entries in ``column`` as a percent string.

    The dataset encodes missing categorical values as the literal string
    'Unknown' rather than NaN, so those are counted instead of true nulls.

    df     : pandas.DataFrame to inspect
    column : column name (str)

    Returns a string such as "14.97%".
    """
    count = df.loc[df[column] == 'Unknown', [column]].shape[0]
    # Bug fix: scale the ratio by 100 before appending "%" — previously a
    # fraction like 0.15 was reported as "0.15%" instead of "15.0%".
    percent = count / df.shape[0] * 100
    return str(round(percent, 2)) + "%"
# Report the 'Unknown' share for every column
all_details = []
for column in df.columns:
    all_details.append({column:ProcessNulls(df, column)})
#Format the output using JSON
print(json.dumps(all_details, indent=3))
df.describe(include="all")
Remarks:
df_original.head(2)
# Label encoding of the target variable
# 0 for existing and 1 for attrited
target_values = {
    "Existing Customer":0,
    "Attrited Customer":1
}
df.Attrition_Flag = df.Attrition_Flag.replace(target_values)
# Unique values of each categories
all_categories = ["Attrition_Flag",
                  "Gender",
                  "Education_Level",
                  "Marital_Status",
                  "Income_Category",
                  "Card_Category"]
# Percentage distribution and a count plot for every categorical column
for category in all_categories:
    print(df[category].value_counts(normalize=True)*100)
    plt.figure(figsize=(8,6))
    # NOTE(review): positional data arg is deprecated in seaborn >= 0.12 — confirm version
    sns.countplot(df[category])
    plt.show()
Remark [On categorical variables]:
numerical_col = df.select_dtypes(include=np.number).columns.tolist()# getting list of numerical columns
# CLIENTNUM is just an identifier, so exclude it from the correlation matrix
unwanted_numeric = ["CLIENTNUM"]
numeric_cols = np.setdiff1d(numerical_col,unwanted_numeric)
plt.figure(figsize=(15,15))
df_corr = df[numeric_cols].corr()
sns.heatmap(df_corr, annot=True);
Insight(correlation)
# Our interest is on the Attrition Flag attribute and therefore we can filter
# correlation values concerning this attribute
# Taking absolute on the values because we are concerned with correlation of
# either end
df_corr["Attrition_Flag"].abs().sort_values(ascending=False)
Remarks:
# Transaction amount vs transaction count, colored by attrition status
sns.relplot(data=df, kind='scatter', \
    x='Total_Trans_Amt', y='Total_Trans_Ct',hue='Attrition_Flag');
Remarks:
# let's see how customer age varies across education levels
# (the original comment about "working hours per week" and "occupation" was a
# copy-paste from a different dataset)
plt.figure(figsize=(10,9))
sns.boxplot(y="Education_Level", x="Customer_Age", data=df);
# Income distribution by gender, ordered from lowest to highest bracket
plt.figure(figsize=(10,8))
# Bug fix: the original order list omitted "$60K - $80K", which silently
# drops that income bracket from the plot; all six categories are now shown.
sns.countplot(df.Income_Category, hue=df.Gender,\
    order=["Less than $40K","$40K - $60K","$60K - $80K","$80K - $120K","$120K +","Unknown"]);
Comments:
Females are dominant in the lowest income category, that is $40K and below.
The majority of males earn between $80K and $120K.
# Income vs education: raw counts plus a row-normalized stacked bar chart
tab1 = pd.crosstab(df.Income_Category,df.Education_Level,margins=True)
print(tab1)
print('-'*100)
tab = pd.crosstab(df.Income_Category,df.Education_Level,normalize='index')
tab.plot(kind='bar',stacked=True,figsize=(17,7))
# Fix: the original called plt.legend twice; the first call
# (loc='lower left', frameon=False) was immediately discarded by the
# second, so only the effective one is kept.
plt.legend(loc="upper left", bbox_to_anchor=(1,1));
Comments:
# Education vs gender: raw counts plus a row-normalized stacked bar chart
tab1 = pd.crosstab(df.Education_Level,df.Gender,margins=True)
print(tab1)
print('-'*100)
tab = pd.crosstab(df.Education_Level,df.Gender,normalize='index')
tab.plot(kind='bar',stacked=True,figsize=(17,7))
plt.axhline(0.5, color="black")  # 50/50 reference line
# NOTE(review): the first legend call is immediately replaced by the second
plt.legend(loc='lower left', frameon=False)
plt.legend(loc="upper left", bbox_to_anchor=(1,1));
Comments:
Educationally, it appears that females are slightly more educated than males overall.
More specifically:
At the Doctorate level, females clearly outnumber males (57%).
Females are also slightly above males at the Uneducated level (53%).
# Education vs attrition: raw counts plus a row-normalized stacked bar chart
tab1 = pd.crosstab(df.Education_Level,df.Attrition_Flag,margins=True)
print(tab1)
print('-'*100)
tab = pd.crosstab(df.Education_Level,df.Attrition_Flag,normalize='index')
tab.plot(kind='bar',stacked=True,figsize=(17,7))
# NOTE(review): the first legend call is immediately replaced by the second
plt.legend(loc='lower left', frameon=False)
plt.legend(loc="upper left", bbox_to_anchor=(1,1));
Observation:
Across all levels of education, customers closed their accounts at similar rates (roughly 80% of customers kept using credit card services at every level of education).
The analysis also suggests that attrition had little to do with the level of education - particularly when it comes to the decision whether to hold on to the credit account or close it.
Moreover, this suggests that the problem or unhappiness with the bank's credit account is experienced equally across all customers irrespective of their level of education.
Clearly, the bank has to take drastic measures to reverse the trend if credit card services are still viewed as a crucial revenue source for the bank.
# Card category vs attrition: raw counts plus a row-normalized stacked bar chart
tab1 = pd.crosstab(df.Card_Category,df.Attrition_Flag,margins=True)
print(tab1)
print('-'*120)
tab = pd.crosstab(df.Card_Category,df.Attrition_Flag,normalize='index')
tab.plot(kind='bar',stacked=True,figsize=(17,7))
# NOTE(review): the first legend call is immediately replaced by the second
plt.legend(loc='lower left', frameon=False)
plt.legend(loc="upper left", bbox_to_anchor=(1,1));
Observation:
Irrespective of product type (Card_Category), card holders closed their accounts in similar proportions across all product types: roughly 80% of customers kept using credit card services in every card category.
Card_Category trends are similar to those observed across education levels.
# Gender vs attrition: raw counts plus a row-normalized stacked bar chart
tab1 = pd.crosstab(df.Gender,df.Attrition_Flag,margins=True)
print(tab1)
print('-'*100)
tab = pd.crosstab(df.Gender,df.Attrition_Flag,normalize='index')
tab.plot(kind='bar',stacked=True,figsize=(17,7))
# NOTE(review): the first legend call is immediately replaced by the second
plt.legend(loc='lower left', frameon=False)
plt.legend(loc="upper left", bbox_to_anchor=(1,1));
Remarks:
Both males and females left credit card services at similar rates.
More than 80% of credit card holders, across both genders, kept using their credit card accounts.
# let us look at a normalized chart of age vs marital status
# NOTE(review): sns.displot creates its own figure, so the preceding
# plt.figure call likely leaves an unused empty figure — confirm.
plt.figure(figsize=(15,9))
sns.displot(
    data=df,
    x="Customer_Age", hue="Marital_Status",
    kind="kde", #height=6,
    multiple="fill", #clip=(0, None),
    #palette="ch:rot=-.25,hue=1,light=.75"
    height=8.27, aspect=11.7/8.27
);
Remarks:
No insightful observation.
# Age distribution split by income category
plt.figure(figsize=(15,7))
sns.histplot(df, x="Customer_Age", hue="Income_Category",stat='density',kde=True,discrete=True)
# Credit limit vs open-to-buy, by income
plt.figure(figsize=(15,9))
sns.scatterplot(x=df.Credit_Limit,y=df.Avg_Open_To_Buy,hue=df.Income_Category)
plt.legend(loc="upper left", bbox_to_anchor=(1,1));
# Credit limit vs utilization ratio, by income
plt.figure(figsize=(15,9))
sns.scatterplot(x=df.Credit_Limit,y=df.Avg_Utilization_Ratio,hue=df.Income_Category)
plt.legend(loc="upper left", bbox_to_anchor=(1,1));
# Transaction amount vs Q4/Q1 amount change, by attrition
plt.figure(figsize=(15,9))
sns.scatterplot(x=df.Total_Trans_Amt,y=df.Total_Amt_Chng_Q4_Q1,hue=df.Attrition_Flag)
plt.legend(loc="upper left", bbox_to_anchor=(1,1));
# Exclude the target and discrete/low-cardinality counts from the pairplot
numeric_cols2 = np.setdiff1d(numeric_cols,\
    ["Attrition_Flag","Dependent_count","Months_Inactive_12_mon",\
     "Total_Relationship_Count","Contacts_Count_12_mon"])
sns.pairplot(df[numeric_cols2])
# Function to plot histogram and boxplot for numerical variables as we study the
# central tendency and dispersion for such values
# Function to plot histogram and boxplot for numerical variables as we study the
# central tendency and dispersion for such values
def histogram_boxplot(feature, figsize=(10,8), bins = None):
    """
    Boxplot and histogram combined with shared x-axis.

    feature: 1-d feature array
    figsize: size of fig (default (10,8))
    bins: number of bins (default None / auto)
    """
    # Two stacked axes: slim boxplot on top (25%), histogram below (75%)
    f2, (ax_box2, ax_hist2) = plt.subplots(nrows = 2,
                                           sharex = True,  # x-axis shared among both subplots
                                           gridspec_kw = {"height_ratios": (.25, .75)},
                                           figsize = figsize
                                           )
    # showmeans=True marks the mean value on the boxplot
    sns.boxplot(feature, ax=ax_box2, showmeans=True, color='red')
    # Simplification: distplot's default is bins=None, so passing bins through
    # directly replaces the old side-effect conditional expression
    # (`call(bins=bins) if bins else call()`) with identical behavior.
    sns.distplot(feature, kde=False, ax=ax_hist2, bins=bins)
    ax_hist2.axvline(np.mean(feature), color='g', linestyle='--')      # mean marker
    ax_hist2.axvline(np.median(feature), color='black', linestyle='-')  # median marker
# Numeric feature columns, excluding the ID and the target
num_cols = list(df.select_dtypes(np.number).columns)
# num_cols.remove("CLIENTNUM").remove("Attrition_Flag")
num_cols = np.setdiff1d(num_cols, ["CLIENTNUM","Attrition_Flag"])
# Print skewness and draw the combined boxplot+histogram for each column
for col in num_cols:
    print(col,",Skew", round(df[col].skew(),2))
    histogram_boxplot(feature=df[col])
    plt.show()
Comments[Skewness, outliers, descriptive statistics]
# A function to treat outliers.
# Replace upper outliers with value of upper whisker and replace lower outliers with value of lower whisker.
# A function to treat outliers.
# Replace upper outliers with value of upper whisker and replace lower outliers with value of lower whisker.
def treat_outliers(df, cols):
    '''
    Cap outliers in the given columns using the 1.5*IQR whisker rule.

    df   - the dataframe (columns are modified in place; df is also returned)
    cols - iterable of column names to treat
    '''
    for name in cols:
        q1 = df[name].quantile(0.25)   # 25th percentile
        q3 = df[name].quantile(0.75)   # 75th percentile
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        # Values outside the whiskers are pinned to the nearest whisker:
        # below `lower` -> lower, above `upper` -> upper.
        df[name] = df[name].clip(lower, upper)
    return df
# Treat outliers for columns with outliers
df = treat_outliers(df,num_cols)
# Check that outliers have been treated via a boxplot per column
plt.figure(figsize=(20,30))
for i, variable in enumerate(num_cols):
    plt.subplot(6,3,i+1)  # grid of up to 6x3 boxplots
    sns.boxplot(df[variable],whis=1.5)
    plt.tight_layout()
    plt.title(variable)
plt.show()
# No more outliers
Remarks (On treating outliers and missing values):
# Replacing ordinal columns - columns with rank of some sort
# NOTE(review): smaller numbers mean higher rank here (Platinum=1, Doctorate=0,
# "$120K +"=0) and "Unknown" is coded -1 — confirm downstream models treat
# these orderings as intended.
replaceStruct = {
    "Card_Category": {
        "Blue": 4,
        "Silver": 3,
        "Gold": 2,
        "Platinum": 1
    },
    "Education_Level": {
        "Unknown": -1,
        "Uneducated": 5,
        "High School": 4,
        "College": 3,
        "Graduate": 2,
        "Post-Graduate":1,
        "Doctorate": 0
    },
    "Income_Category": {
        "Unknown": -1,
        "Less than $40K": 4,
        "$40K - $60K": 3,
        "$60K - $80K": 2,
        "$80K - $120K": 1,
        "$120K +": 0
    }
}
# Nominal columns that will be encoded with One-Hot encoding
one_hot = ["Marital_Status", "Gender"]
# Replace values on ordinal columns with values shown above
df.replace(replaceStruct, inplace=True)
# One hot encoding
df = pd.get_dummies(df, columns=one_hot)
df.head(2)
Attempt [Tried and failed]: Using mid-point values for the income instead of the ordinal values did not help to improve the models so we will not pursue that option.
# Client numbers are unique identifiers. They will not help us
# improve our model performance
df.drop(["CLIENTNUM"], axis=1, inplace=True)
We will calculate 4 metrics - Accuracy, Precision, Recall and F1 - but the metric of interest here is recall, because failing to identify a customer who is about to attrite is the costliest error.
Recall measures the percentage of actual positive instances that are correctly identified - here, identifying attrited customers in the dataset. Hyperparameters will, therefore, be tuned using recall_score.
df_original["Attrition_Flag"].value_counts(normalize=True)*100
# filter existing customers (class 0) and attrited (class 1)
df_class0 = df.loc[df["Attrition_Flag"]==0]
df_class1 = df.loc[df["Attrition_Flag"]==1]
# Shuffle data in each class and split them into 3 at 60-20-20 rule.
# Splitting per class keeps the class proportions equal across the
# train/val/test sets - a manual stratified split.
train_class0, val_class0, test_class0 = \
    np.split(df_class0.sample(frac=1, random_state=42),
             [int(0.6*len(df_class0)), int(0.8*len(df_class0))])
train_class1, val_class1, test_class1 = \
    np.split(df_class1.sample(frac=1, random_state=42),
             [int(0.6*len(df_class1)), int(0.8*len(df_class1))])
# Join the per-class pieces to form the train, validation and test sets
# (no second shuffle is performed here, despite the original comment)
train = pd.concat([train_class0, train_class1])
val = pd.concat([val_class0, val_class1])
test = pd.concat([test_class0,test_class1])
# Check proportion of the two classes in each set:
print(dict(train["Attrition_Flag"].value_counts(normalize=True)*100))
print(dict(val["Attrition_Flag"].value_counts(normalize=True)*100))
print(dict(test["Attrition_Flag"].value_counts(normalize=True)*100))
# Extract target and features from each set
X_train, y_train = train.drop(['Attrition_Flag'], axis=1), train['Attrition_Flag']
X_val, y_val = val.drop(['Attrition_Flag'], axis=1), val['Attrition_Flag']
X_test, y_test = test.drop(['Attrition_Flag'], axis=1), test['Attrition_Flag']
Remark
# Displaying shapes for each set split
# Train -60%, val-20% and test-20%
print("X_train shape",X_train.shape)
print("y_train shape",y_train.shape)
print("X_val shape", X_val.shape)
print("y_val shape",y_val.shape)
print("X_test shape", X_test.shape)
print("y_test shape",y_test.shape)
# Keeping all the model performance results here (appended by GetScores)
model_results = []
## Function to calculate different metric scores of the model - Accuracy, Recall and Precision
## Function to calculate different metric scores of the model - Accuracy, Recall, Precision and F1
def GetScores(model,model_name=None,time_taken=None):
    '''
    Predict with `model` on the module-level train/val/test splits and report
    accuracy, recall, precision and f1 for each split.

    model      : fitted classifier exposing predict() and score()
    model_name : optional display name. When omitted it is derived from the
                 model repr plus a random 3-char suffix; the suffix is stored
                 in the global `ext` so ConfusionMatrix can rebuild the name.
    time_taken : optional tuning/execution time recorded with the results

    Side effects: appends the result dict to the module-level `model_results`
    list (once per unique model_name) and prints it as JSON.
    Returns the result dict.
    '''
    #Predicting on train, validation and test sets
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)
    pred_test = model.predict(X_test)
    #Accuracy of the model
    train_acc = model.score(X_train,y_train)
    val_acc = model.score(X_val,y_val)
    test_acc = model.score(X_test,y_test)
    #Recall of the model (the metric of interest for this problem)
    train_recall = recall_score(y_train,pred_train)
    val_recall = recall_score(y_val,pred_val)
    test_recall = recall_score(y_test,pred_test)
    #Precision of the model
    train_precision = precision_score(y_train,pred_train)
    val_precision = precision_score(y_val,pred_val)
    test_precision = precision_score(y_test,pred_test)
    #F1 of the model (the original comment wrongly said "Precision" here)
    train_f1 = f1_score(y_train,pred_train)
    val_f1 = f1_score(y_val,pred_val)
    test_f1 = f1_score(y_test,pred_test)
    # Model name is given explicitly as function argument or derived from the model object
    if model_name is None:
        # extract model name from model object
        model_name = str(model).split("(")[0]
        # Tuned and default models may share a name, so append a random
        # 3-char suffix. `ext` is global so ConfusionMatrix can reuse it.
        global ext
        ext = "".join(random.sample("abcdefghijklmneuiqyriur0peq9r69829246rmpf930479bc",3))
        model_name = model_name +"_"+ext
    # results to return
    results = {
        "model_name":model_name,
        "train_acc": train_acc,
        "val_acc": val_acc,
        "test_acc":test_acc,
        "train_recall": train_recall,
        "val_recall": val_recall,
        "test_recall": test_recall,
        "train_precision": train_precision,
        "val_precision": val_precision,
        "test_precision": test_precision,
        "train_f1": train_f1,
        "val_f1": val_f1,
        "test_f1": test_f1,
        "execution_time": time_taken
    }
    # Fix: the old try/except dance could never catch a useful KeyError
    # (every stored entry carries "model_name") and risked double-appending;
    # a plain membership check keeps model_results duplicate-free. The unused
    # score_list local was also removed.
    if model_name not in [entry["model_name"] for entry in model_results]:
        model_results.append(results)
    else:
        print("Caught up")
    # Formatting Python dictionary using JSON library
    print(json.dumps(results, indent=3))
    # returning the scores in dictionary format
    return results
## Function to create confusion matrix
## Function to create confusion matrix
def ConfusionMatrix(model,y_actual,set1,labels=[0, 1], model_name = None, save_as=None):
    '''
    Plot (and optionally save) a labelled confusion matrix for one data split.

    model      : classifier to predict values of X
    y_actual   : ground truth for the chosen split
    set1       : which split to evaluate - "train", "val" or "test"
    labels     : kept for backward compatibility; the matrix is always drawn
                 with class order [1, 0] (Attrited first) and the parameter
                 is overwritten internally (as in the original)
    model_name : display name; when omitted it is derived from the model repr
                 plus the global `ext` suffix set by GetScores
    save_as    : when given, the figure is saved to ./images/<set1>_<save_as>
    '''
    # set1 must be either "train", "val" or "test" otherwise raise an error
    assert set1 in ["train","val","test"], \
        "set1 can only take one of these values 'train', 'val' or 'test'"
    if model_name is None:
        model_name = str(model).split("(")[0]
        model_name = model_name +"_"+ ext
    # De-duplication: the original repeated identical plotting code in three
    # branches (train/val/test); a single split lookup keeps one copy.
    X_set = {"train": X_train, "val": X_val, "test": X_test}[set1]
    # Catch a mismatched split / y_actual early
    assert len(X_set)==len(y_actual),\
        "check that you passed y_actual for {} set or fix set1 value".format(set1)
    y_predict = model.predict(X_set)
    cm=metrics.confusion_matrix( y_actual, y_predict, labels=[1, 0])
    df_cm = pd.DataFrame(cm, index = ["Actual - Attrited","Actual - Existing"],
                         columns = ['Predicted - Attrited','Predicted - Existing'])
    # Annotate each cell with its raw count and its share of all samples
    group_counts = ["{0:0.0f}".format(value) for value in cm.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in cm.flatten()/np.sum(cm)]
    labels = [f"{v1}\n{v2}" for v1, v2 in zip(group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    plt.figure(figsize = (10,7))
    sns.heatmap(df_cm, annot=labels,fmt='')
    plt.title("Confusion Matrix for {} set. Model:{}".format(set1,model_name))
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Save the plot if need be
    if save_as != None:
        if not os.path.exists("images"):
            os.mkdir("images")
        plt.savefig("./images/{}_{}".format(set1, save_as))
    # Display the plot
    plt.show()
# Decision tree with default parameters (baseline tree model)
dt_default = DecisionTreeClassifier(random_state=42)
dt_default.fit(X_train, y_train)
#Call GetScores and ConfusionMatrix functions to evaluate the said model
name = "Decision Tree Default"
GetScores(dt_default,model_name=name)
ConfusionMatrix(dt_default, y_test,set1="test", model_name = name)
Insights:
# PCA + decision tree pipeline so PCA is refit inside each CV fold
pipeline_dt = Pipeline(steps=[
    ('pca', PCA()),
    ("model",DecisionTreeClassifier(random_state=42))
])
# model = DecisionTreeClassifier(random_state=42)
# NOTE(review): max_features "auto" is deprecated/removed in newer sklearn — confirm version
param_grid = {
    'pca__n_components': [6,12,16],
    'model__criterion':["gini","entropy"],
    'model__max_depth': [3,6,9,12],
    'model__min_samples_split': [2, 4, 6, 8],
    'model__max_features': [2,6,12,"auto"]
}
# print(pipeline_dt.get_params().keys())
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)
#Run GridSearchCV
start_time=time.time()
grid_obj = GridSearchCV(pipeline_dt , param_grid = param_grid,scoring=acc_scorer, cv = 5, n_jobs=c_count)
# Use validation set to search for best parameters
# NOTE(review): searching on X_val only (not X_train) is unusual — confirm intent
grid_obj = grid_obj.fit( X_val, y_val)
time_taken_dtgs = time.time()-start_time
print("It took {} seconds to execute".format(time_taken_dtgs))
# Set dt_tuned to the best combination of parameters
dt_tuned = grid_obj.best_estimator_
#Print best parameters
print(grid_obj.best_params_)
# Fit the best algorithm to the train data.
dt_tuned.fit(X_train, y_train)
#Call GetScores and ConfusionMatrix functions to evaluate the said model
name = "Decision Tree Tuned GridSearchCV"
GetScores(dt_tuned,model_name=name, time_taken=time_taken_dtgs)
ConfusionMatrix(dt_tuned, y_test,set1="test", model_name = name)
# Same PCA + decision tree pipeline for the randomized search
pipeline_dt2 = Pipeline(steps=[
    ('pca', PCA()),
    ("model",DecisionTreeClassifier(random_state=42))
])
# model = DecisionTreeClassifier(random_state=42)
# Distributions (not lists) let RandomizedSearchCV sample parameter values
param_grid2 = {
    'pca__n_components': sp_randint(6,16),
    'model__criterion':["gini","entropy"],
    'model__max_depth': sp_randint(3,12),
    'model__min_samples_split': sp_randint(2,8),
    'model__max_features': [2,6,12,"auto"]
}
# print(pipeline_dt2.get_params().keys())
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)
#Run RandomizedSearchCV (the original comment said GridSearchCV)
start_time=time.time()
rscv = RandomizedSearchCV(pipeline_dt2 , param_distributions = param_grid2,\
                          scoring=acc_scorer, cv = 5, n_jobs=c_count)
# Use validation set to search for best parameters
rscv = rscv.fit( X_val, y_val)
time_taken_dtrs = time.time()-start_time
print("It took {} seconds to execute".format(time_taken_dtrs))
# Set dt_tuned2 to the best combination of parameters
dt_tuned2 = rscv.best_estimator_
#Print best parameters
print(rscv.best_params_)
# Fit the best algorithm to the train data.
dt_tuned2.fit(X_train, y_train)
#Call GetScores and ConfusionMatrix functions to evaluate the said model
name = "Decision Tree Tuned RandomSearchCV"
GetScores(dt_tuned2,model_name=name, time_taken=time_taken_dtrs)
ConfusionMatrix(dt_tuned2, y_test,set1="test", model_name = name)
Insights:
# Compare the tuning time of the two search strategies
print("It took {} seconds for GridSearchCV".format(time_taken_dtgs))
print("It took {} seconds for RandomizedSearchCV".format(time_taken_dtrs))
# Random forest with default parameters
rf_default = RandomForestClassifier(random_state=42)
rf_default.fit(X_train, y_train)
# Model performance
name = "Random Forest Default"
GetScores(rf_default, model_name=name)
ConfusionMatrix(rf_default,y_test, set1="test", model_name=name)
# PCA + random forest pipeline; make_pipeline names steps automatically
# (hence the lowercase "randomforestclassifier__" parameter prefixes)
pipeline_rf_gs = make_pipeline(
    (PCA()),
    (RandomForestClassifier(random_state=42))
)
# Grid of parameters to choose from
parameters = { "pca__n_components": [5,10,15,20],
    "randomforestclassifier__n_estimators": [50,100,200,250],
    "randomforestclassifier__min_samples_leaf": list(np.arange(4, 11,3)),
    "randomforestclassifier__max_features": [0.4, 0.7, 0.9], #list(np.arange(0.2, 0.7, 0.1)),
    "randomforestclassifier__max_samples": [0.4, 0.7, 0.9], #list(np.arange(0.3, 0.7, 0.1)),
    "randomforestclassifier__max_depth": [6,9,12]
}
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)
# Run the grid search
start_time = time.time()
grid_obj_rf1 = GridSearchCV(pipeline_rf_gs, parameters, scoring=acc_scorer,cv=5, n_jobs=c_count)
grid_obj_rf1 = grid_obj_rf1.fit(X_val, y_val)
time_taken2 = time.time()-start_time
# print(pipeline_rf_gs.get_params().keys())
# Set rf_estimator_tuned1 to the best combination of parameters
rf_estimator_tuned1 = grid_obj_rf1.best_estimator_
# Fit the best algorithm to the data.
rf_estimator_tuned1.fit(X_train, y_train)
#Call GetScores and ConfusionMatrix functions to evaluate the said model
name = "Random Forest Tuned GridSearchCV"
GetScores(rf_estimator_tuned1,model_name=name, time_taken=time_taken2)
ConfusionMatrix(rf_estimator_tuned1, y_test,set1="test", model_name = name)
# PCA + random forest pipeline for the randomized search.
# NOTE(review): this rebinds the name pipeline_dt2 from the decision-tree
# section; a distinct name would avoid confusion.
pipeline_dt2 = make_pipeline(
    (PCA()),
    (RandomForestClassifier(random_state=42))
)
# model = DecisionTreeClassifier(random_state=42)
# Distributions for RandomizedSearchCV. scipy's uniform(loc, scale) samples
# from [loc, loc + scale], so the original uniform(0.4, 0.9) could draw
# fractions above 1.0, which RandomForestClassifier rejects for the
# max_features / max_samples float parameters. uniform(0.4, 0.5) keeps the
# draws in [0.4, 0.9], matching the grid-search values above.
parameters2 = {
    "pca__n_components": sp_randint(5,20),
    "randomforestclassifier__n_estimators": sp_randint(50,250),
    "randomforestclassifier__min_samples_leaf": sp_randint(5,12),
    "randomforestclassifier__max_features": uniform(0.4, 0.5),
    "randomforestclassifier__max_samples": uniform(0.4, 0.5),
    "randomforestclassifier__max_depth": sp_randint(3,12)
}
# print(pipeline_dt2.get_params().keys())
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)
#Run RandomizedSearchCV (the original comment said GridSearchCV)
start_time=time.time()
grid_obj_rf2 = RandomizedSearchCV(pipeline_dt2 , param_distributions = parameters2,\
                                  scoring=acc_scorer, cv = 5, n_jobs=c_count)
# Use validation set to search for best parameters
grid_obj_rf2 = grid_obj_rf2.fit( X_val, y_val)
time_taken2 = time.time()-start_time
print("It took {} seconds to execute".format(time_taken2))
# Set rf_estimator_tuned2 to the best combination of parameters
rf_estimator_tuned2 = grid_obj_rf2.best_estimator_
#Print best parameters
print(grid_obj_rf2.best_params_)
# Fit the best algorithm to the train data.
rf_estimator_tuned2.fit(X_train, y_train)
#Call GetScores and ConfusionMatrix functions to evaluate the said model
name = "Random Forest Tuned RandomSearchCV"
GetScores(rf_estimator_tuned2,model_name=name, time_taken=time_taken2)
ConfusionMatrix(rf_estimator_tuned2, y_test,set1="test", model_name = name)
# Bagging classifier with default parameters.
# The base estimator is by default Decision Tree classifier
bagging_estimator = BaggingClassifier(random_state=42)
bagging_estimator.fit(X_train, y_train)
name = "Bagging Classifier Default"
GetScores(bagging_estimator,model_name=name)
ConfusionMatrix(bagging_estimator, y_test,set1="test", model_name = name)
pipeline_bagging1 = make_pipeline(
    (PCA()),
    (BaggingClassifier(random_state=42))
) # By default the base classifier used is Decision Tree. Another option
# could be to use Logistic Model but we will keep it as defaulted
print(pipeline_bagging1.get_params().keys())
# model = DecisionTreeClassifier(random_state=42)
# np.arange(0.2, 1.0, 0.2) yields [0.2, 0.4, 0.6, 0.8]
parameters1 = {
    "baggingclassifier__n_estimators": np.arange(50,200,50),
    "baggingclassifier__max_features": list(np.arange(0.2, 1.0, 0.2)),
    "baggingclassifier__max_samples": list(np.arange(0.2, 1.0, 0.2)),
}
# print(pipeline_dt2.get_params().keys())
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)
#Run GridSearchCV
start_time=time.time()
grid_obj_bagging1 = GridSearchCV(pipeline_bagging1 , parameters1,\
                                 scoring=acc_scorer, cv = 5, n_jobs=c_count)
# Use validation set to search for best parameters
grid_obj_bagging1 = grid_obj_bagging1.fit( X_val, y_val)
time_taken2 = time.time()-start_time
print("It took {} seconds to execute".format(time_taken2))
# Set bagging_tuned1 to the best combination of parameters
bagging_tuned1 = grid_obj_bagging1.best_estimator_
#Print best parameters
print(grid_obj_bagging1.best_params_)
# Fit the best algorithm to the train data.
bagging_tuned1.fit(X_train, y_train)
#Call GetScores and ConfusionMatrix functions to evaluate the said model
name = "Bagging Classifier Tuned GridSearchCV"
GetScores(bagging_tuned1,model_name=name, time_taken=time_taken2)
ConfusionMatrix(bagging_tuned1, y_test,set1="test", model_name = name)
# PCA + bagging pipeline for the randomized search
pipeline_bagging2 = make_pipeline(
    (PCA()),
    (BaggingClassifier(random_state=42))
) # By default the base classifier used is Decision Tree. Another option
# could be to use Logistic Model but we will keep it as defaulted
print(pipeline_bagging2.get_params().keys())
# model = DecisionTreeClassifier(random_state=42)
# scipy's uniform(loc, scale) samples from [loc, loc + scale]; the original
# uniform(0.2, 1.0) could draw values up to 1.2, which BaggingClassifier
# rejects for the max_features / max_samples float fractions.
# uniform(0.2, 0.8) keeps the draws within [0.2, 1.0].
parameters2 = {
    "baggingclassifier__n_estimators": sp_randint(50,200),
    "baggingclassifier__max_features": uniform(0.2, 0.8),
    "baggingclassifier__max_samples": uniform(0.2, 0.8),
}
# print(pipeline_dt2.get_params().keys())
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)
#Run RandomizedSearchCV (the original comment said GridSearchCV)
start_time=time.time()
grid_obj_bagging2 = RandomizedSearchCV(pipeline_bagging2 , parameters2,\
                                       scoring=acc_scorer, cv = 5, n_jobs=c_count)
# Use validation set to search for best parameters
grid_obj_bagging2 = grid_obj_bagging2.fit( X_val, y_val)
time_taken2 = time.time()-start_time
print("It took {} seconds to execute".format(time_taken2))
# Set bagging_tuned2 to the best combination of parameters
bagging_tuned2 = grid_obj_bagging2.best_estimator_
#Print best parameters
print(grid_obj_bagging2.best_params_)
# Fit the best algorithm to the train data.
bagging_tuned2.fit(X_train, y_train)
#Call GetScores and ConfusionMatrix functions to evaluate the said model
name = "Bagging Classifier Tuned RandomSearchCV"
GetScores(bagging_tuned2,model_name=name, time_taken=time_taken2)
ConfusionMatrix(bagging_tuned2, y_test,set1="test", model_name = name)
# Adaboost classifier with default parameters
# NOTE(review): random_state=1 here while earlier models use 42 — confirm
# the inconsistency is intentional.
ada_default = AdaBoostClassifier(random_state=1)
ada_default.fit(X_train,y_train)
#Using above defined function to get accuracy, recall and precision on train and test set
name = "AdaBoost Classifier Default"
GetScores(ada_default, model_name=name)
ConfusionMatrix(ada_default, y_test,model_name=name, set1="test")
# Important features as per the ada_default classifier
importances = ada_default.feature_importances_
indices = np.argsort(importances)  # NOTE(review): computed but never used
feature_names = list(X_train.columns)
r = {
    "Column": feature_names,
    "Importance": importances
}
print(20*"#","TOP 3",20*"#")
# Top 3 features ranked by AdaBoost importance
df_important_ada = pd.DataFrame(r).sort_values(by="Importance", ascending=False)[:3]
df_important_ada
Comment:
# logistic with 10-fold cross validation
# NOTE(review): this CV runs on the full df (train+val+test together) and the
# KFold has no random_state, so results are neither comparable to the
# split-based evaluations nor reproducible — confirm intent.
logistic_cv = LogisticRegression(random_state=1)
kfold = KFold(n_splits = 10, shuffle=True)
results = cross_val_score(logistic_cv,df.drop(["Attrition_Flag"], axis=1), df["Attrition_Flag"], cv=kfold, scoring='roc_auc',n_jobs=c_count)
print(results)
print("Mean:",results.mean(),"Standard Deviation:", results.std())
# Logistic regression model with default parameters
logistic_default = LogisticRegression(random_state=1)
logistic_default.fit(X_train, y_train)
#Using above defined function to get accuracy, recall and precision on train and test set
name = "Logistic Regression Default"
GetScores(logistic_default, model_name=name)
ConfusionMatrix(logistic_default, y_test,set1="test", model_name=name)
# PCA + logistic regression pipeline for the grid search
pipeline_logistic = make_pipeline(
    (PCA()), LogisticRegression(random_state=42)
)
print(pipeline_logistic.get_params().keys())
# model = DecisionTreeClassifier(random_state=42)
# NOTE(review): class_weight {0: 0.8, 1: 0.2} weights the majority class
# more heavily even though recall of class 1 is the tuning target — confirm
# this is intentional.
parameters1 = {
    "pca__n_components": np.arange(4,25,6),
    "logisticregression__class_weight": [{0: 0.8, 1: 0.2}],
    "logisticregression__tol":[1e-3,1e-4,1e-5],
    "logisticregression__solver": ["newton-cg", "lbfgs", "liblinear"]
}
# print(pipeline_dt2.get_params().keys())
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)
#Run GridSearchCV
start_time=time.time()
grid_obj_logistic1 = GridSearchCV(pipeline_logistic , parameters1,\
                                  scoring=acc_scorer, cv = 5, n_jobs=c_count)
# Use validation set to search for best parameters
grid_obj_logistic1 = grid_obj_logistic1.fit( X_val, y_val)
time_taken = time.time()-start_time
print("It took {} seconds to execute".format(time_taken))
# Set logisitic_tuned1 to the best combination of parameters
logisitic_tuned1 = grid_obj_logistic1.best_estimator_
#Print best parameters
print(grid_obj_logistic1.best_params_)
# Fit the best algorithm to the train data.
logisitic_tuned1.fit(X_train, y_train)
#Using above defined function to get accuracy, recall and precision on train and test set
name = "Logistic Regression Tuned GridSearchCV"
GetScores(logisitic_tuned1, model_name=name, time_taken=time_taken)
ConfusionMatrix(logisitic_tuned1, y_test,set1="test", model_name=name)
# PCA + logistic regression pipeline, this time tuned with RandomizedSearchCV
# (samples candidates instead of exhausting the grid; n_iter defaults to 10).
pipeline_logistic = make_pipeline(
(PCA()), LogisticRegression(random_state=42)
)
print(pipeline_logistic.get_params().keys())
# model = DecisionTreeClassifier(random_state=42)
# Search space: n_components drawn from randint(4, 25); balanced class
# weights here (vs the 0.8/0.2 weighting in the grid-search cell above).
parameters2 = {
"pca__n_components": sp_randint(4,25),
"logisticregression__class_weight": [{0: 0.5, 1: 0.5}],
"logisticregression__tol":[1e-3,1e-4,1e-5],
"logisticregression__solver": ["newton-cg", "lbfgs", "liblinear"]
}
# print(pipeline_dt2.get_params().keys())
# Type of scoring used to compare parameter combinations
# (recall, despite the acc_ prefix)
acc_scorer = metrics.make_scorer(metrics.recall_score)
#Run RandomizedSearchCV
start_time=time.time()
grid_obj_logistic2 = RandomizedSearchCV(pipeline_logistic , parameters2,\
scoring=acc_scorer, cv = 5, n_jobs=c_count)
# Use validation set to search for best parameters
grid_obj_logistic2 = grid_obj_logistic2.fit( X_val, y_val)
time_taken = time.time()-start_time
print("It took {} seconds to execute".format(time_taken))
# Keep the best pipeline found by the search
logisitic_tuned2 = grid_obj_logistic2.best_estimator_
#Print best parameters
print(grid_obj_logistic2.best_params_)
# Fit the best algorithm to the train data.
logisitic_tuned2.fit(X_train, y_train)
#Using above defined function to get accuracy, recall and precision on train and test set
name = "Logistic Regression Tuned RandomizedSearchCV"
GetScores(logisitic_tuned2, model_name=name, time_taken=time_taken)
ConfusionMatrix(logisitic_tuned2, y_test,set1="test", model_name=name)
# Oversampling minority class (Attrited) using SMOTE algorithm.
# Class balance before resampling:
print("Before UpSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before UpSampling, counts of label '0': {} \n".format(sum(y_train==0)))
# sampling_strategy=1 -> resample until both classes are the same size;
# k_neighbors=1 keeps synthetic points close to real minority samples.
sm = SMOTE(sampling_strategy = 1 ,k_neighbors = 1, random_state=1) #Synthetic Minority Over Sampling Technique
# np.ravel instead of y_train.ravel(): Series.ravel is deprecated and removed
# in recent pandas versions, while np.ravel works for both Series and arrays.
X_train_res, y_train_res = sm.fit_resample(X_train, np.ravel(y_train))
print("After UpSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After UpSampling, counts of label '0': {} \n".format(sum(y_train_res==0)))
print('After UpSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After UpSampling, the shape of train_y: {} \n'.format(y_train_res.shape))
# fit Logistic model on upsampled data, timing the fit so the reported
# time_taken refers to THIS model (previously the stale value left over
# from the RandomizedSearchCV cell above was reported).
logistic_model_upsampled = LogisticRegression(random_state=1)
start_time = time.time()
logistic_model_upsampled.fit(X_train_res, y_train_res)
time_taken = time.time() - start_time
#Using above defined function to get accuracy, recall and precision on train and test set
name = "Logistic Regression Oversampling SMOTE"
GetScores(logistic_model_upsampled, model_name=name, time_taken=time_taken)
ConfusionMatrix(logistic_model_upsampled, y_test,set1="test", model_name=name)
Insight:
# Downsampling: balance the training set by keeping ALL attrited customers
# and an equal-size random sample of existing customers.
train_data = pd.DataFrame(y_train).join(X_train, how='outer')
print("Training rows", len(train_data))
# Attrited (label 0) rows — keep every one of them.
attrited_indices = train_data[train_data['Attrition_Flag'] == 0].index
attrited = len(attrited_indices)
print("Attried customers", len(attrited_indices))
# Existing (label 1) rows — these get downsampled.
existing_indices = train_data[train_data['Attrition_Flag'] == 1].index
existing = len(existing_indices)
print("Existing customers", len(existing_indices))
# Sample exactly as many existing customers as there are attrited ones.
# (Previously hard-coded to 971, which silently breaks if the train split
# or the data changes; `attrited` was computed but never used.)
random_indices = np.random.choice( existing_indices, attrited , replace=False)
all_sampled_indices = np.concatenate([attrited_indices,random_indices])
# Select from train_data (not df) so only training rows can be picked.
train_downsampled = train_data.loc[all_sampled_indices]
# Bare expression: displayed as the notebook cell's output.
train_downsampled
print("On downsampling: ")
# Mask train_downsampled with its OWN column. Previously the boolean Series
# came from train_data, whose index does not align with train_downsampled,
# so the printed counts were wrong.
print("Attrited", len(train_downsampled[train_downsampled['Attrition_Flag'] == 0]))
print("Existing", len(train_downsampled[train_downsampled['Attrition_Flag'] == 1]))
X_ds = train_downsampled.drop(["Attrition_Flag"], axis=1)
y_ds = train_downsampled["Attrition_Flag"]
X_train_ds, X_test_ds, y_train_ds, y_test_ds = train_test_split(X_ds, y_ds, test_size = 0.3, random_state=42)
# fit Logistic model on downsampled data, timing the fit so the reported
# time_taken refers to THIS model rather than a stale earlier value.
logistic_model_downsampled = LogisticRegression(random_state=1)
start_time = time.time()
logistic_model_downsampled.fit(X_train_ds, y_train_ds)
time_taken = time.time() - start_time
#Using above defined function to get accuracy, recall and precision on train and test set
name = "Logistic Regression Downsampled"
GetScores(logistic_model_downsampled, model_name=name, time_taken=time_taken)
ConfusionMatrix(logistic_model_downsampled, y_test,set1="test", model_name=name)
Insight:
# PCA + AdaBoost pipeline tuned with GridSearchCV (recall scoring).
pipeline_ada = make_pipeline(
(PCA()),
(AdaBoostClassifier(random_state=42))
)
# print(pipeline_ada.get_params().keys())
# Type of scoring used to compare parameter combinations
# (recall, despite the acc_ prefix)
acc_scorer = metrics.make_scorer(metrics.recall_score)
#Parameters grid
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn
# 1.2 and removed in 1.4 — confirm against the sklearn version printed at
# the top of the notebook.
parameters_ada = {
"pca__n_components": np.arange(4,21,5),
#Let's try different max_depth for base_estimator
"adaboostclassifier__base_estimator":[DecisionTreeClassifier(max_depth=3),DecisionTreeClassifier(max_depth=6),DecisionTreeClassifier(max_depth=12)],
"adaboostclassifier__n_estimators": np.arange(30,151,20),
"adaboostclassifier__learning_rate":np.arange(0.1,2,0.4)
}
# Run the grid search on the validation set and time it
start = time.time()
grid_obj_ada = GridSearchCV(pipeline_ada, parameters_ada, scoring=acc_scorer,cv=5, n_jobs=c_count)
grid_obj_ada = grid_obj_ada.fit(X_val, y_val)
time_taken = time.time()-start
print("The execution of time is {}".format(time_taken))
# The best parameters
# print(grid_obj_ada.get_params())
# Set the clf to the best combination of parameters
ada_tuned = grid_obj_ada.best_estimator_
# Fit the best algorithm to the data.
ada_tuned.fit(X_train, y_train)
#Call get_score and ConfusionMatrix functions to evaluate the said model
name = "AdaBoost Classifier Tuned GridSearchCV"
GetScores(ada_tuned,model_name=name, time_taken=time_taken)
ConfusionMatrix(ada_tuned, y_test,set1="test", model_name = name)
# PCA + AdaBoost pipeline tuned with RandomizedSearchCV (recall scoring).
pipeline_ada2 = make_pipeline(
(PCA()),
(AdaBoostClassifier(random_state=42))
)
# print(pipeline_ada.get_params().keys())
# Type of scoring used to compare parameter combinations
# (recall, despite the acc_ prefix)
acc_scorer = metrics.make_scorer(metrics.recall_score)
#Parameters grid
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn
# 1.2 and removed in 1.4 — confirm against the printed sklearn version.
# scipy's uniform(loc, scale) spans [loc, loc + scale], so learning_rate is
# sampled from [0.1, 2.1].
parameters_ada2 = {
"pca__n_components": sp_randint(4,21),
#Let's try different max_depth for base_estimator
"adaboostclassifier__base_estimator":[DecisionTreeClassifier(max_depth=3),DecisionTreeClassifier(max_depth=6),DecisionTreeClassifier(max_depth=12)],
"adaboostclassifier__n_estimators": sp_randint(30,151),
"adaboostclassifier__learning_rate":uniform(0.1,2)
}
# Run the randomized search on the validation set and time it
start = time.time()
grid_obj_ada2 = RandomizedSearchCV(pipeline_ada2, parameters_ada2, scoring=acc_scorer,cv=5, n_jobs=c_count)
grid_obj_ada2 = grid_obj_ada2.fit(X_val, y_val)
time_taken = time.time()-start
print("The execution of time is {}".format(time_taken))
# The best parameters
# print(grid_obj_ada2.get_params())
# Set the clf to the best combination of parameters
ada_tuned2 = grid_obj_ada2.best_estimator_
# Fit the best algorithm to the data.
ada_tuned2.fit(X_train, y_train)
#Call get_score and ConfusionMatrix functions to evaluate the said model
name = "AdaBoost Classifier Tuned RandomSearchCV"
GetScores(ada_tuned2,model_name=name, time_taken=time_taken)
ConfusionMatrix(ada_tuned2, y_test,set1="test", model_name = name)
# Gradient Boosting Model with default hyperparameters — baseline before tuning
gbm_default = GradientBoostingClassifier(random_state=1)
gbm_default.fit(X_train,y_train)
#Using above defined function to get accuracy, recall and precision on train and test set
name = "Gradient Boost Classifier"
GetScores(gbm_default, model_name=name)
ConfusionMatrix(gbm_default, y_test,model_name=name, set1="test")
# Impurity-based feature importances of the fitted model
importances = gbm_default.feature_importances_
indices = np.argsort(importances)  # NOTE(review): computed but never used below
feature_names = list(X_train.columns)
r = {
"Column": feature_names,
"Importance": importances
}
print(20*"#","TOP 3",20*"#")
# Display the three most important features (notebook cell output)
pd.DataFrame(r).sort_values(by="Importance", ascending=False)[:3]
# Define the pipeline with two processes: PCA and the classifier
pipeline_gbm = make_pipeline(
(PCA()),
(GradientBoostingClassifier(random_state=1))
)
print(pipeline_gbm.get_params().keys())
# Grid of parameters to choose from
# NOTE(review): unlike the other searches, there is no pca__n_components
# entry, so PCA keeps all components for every candidate — confirm intended.
parameter_gbm = {
"gradientboostingclassifier__n_estimators": np.arange(50,251,100),
"gradientboostingclassifier__max_depth": np.arange(1,15,5),
"gradientboostingclassifier__max_features": np.arange(0.4,1,0.2)
}
# Type of scoring used to compare parameter combinations (recall)
acc_scorer = metrics.make_scorer(metrics.recall_score)
# Run the grid search on the validation set and time it
start = time.time()
grid_obj_gbm = GridSearchCV(pipeline_gbm, parameter_gbm, scoring=acc_scorer,cv=5, n_jobs=c_count)
grid_obj_gbm = grid_obj_gbm.fit(X_val, y_val)
time_taken = time.time()-start
print("The execution of time is {}".format(time_taken))
# Set the clf to the best combination of parameters
gbm_tuned = grid_obj_gbm.best_estimator_
# Fit the best algorithm to the data.
gbm_tuned.fit(X_train, y_train)
#Call get_score and ConfusionMatrix functions to evaluate the said model
name = "Gradient Boost Classifier Tuned GridSearchCV"
GetScores(gbm_tuned,model_name=name, time_taken=time_taken)
ConfusionMatrix(gbm_tuned, y_test,set1="test", model_name = name)
# Define the pipeline with two processes: PCA and the classifier
pipeline_gbm1 = make_pipeline(
(PCA()),
(GradientBoostingClassifier(random_state=1))
)
print(pipeline_gbm1.get_params().keys())
# Distributions of parameters to sample from.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so
# max_features must be uniform(0.4, 0.6) to stay inside the valid (0, 1]
# fraction range. The previous uniform(0.4, 1) could draw values up to 1.4,
# which is invalid for a float max_features and makes those candidates fail.
parameter_gbm1 = {
"gradientboostingclassifier__n_estimators": sp_randint(50,251),
"gradientboostingclassifier__max_depth": sp_randint(1,15),
"gradientboostingclassifier__max_features": uniform(0.4, 0.6)
}
# Type of scoring used to compare parameter combinations (recall, despite
# the acc_ prefix)
acc_scorer = metrics.make_scorer(metrics.recall_score)
# Run the randomized search on the validation set and time it
start = time.time()
grid_obj_gbm1 = RandomizedSearchCV(pipeline_gbm1, parameter_gbm1, scoring=acc_scorer,cv=5, n_jobs=c_count)
grid_obj_gbm1 = grid_obj_gbm1.fit(X_val, y_val)
time_taken = time.time()-start
print("The execution of time is {}".format(time_taken))
# Set the clf to the best combination of parameters
gbm_tuned1 = grid_obj_gbm1.best_estimator_
# Fit the best algorithm to the data.
gbm_tuned1.fit(X_train, y_train)
#Call get_score and ConfusionMatrix functions to evaluate the said model
name = "Gradient Boost Classifier Tuned RandomSearchCV"
GetScores(gbm_tuned1, model_name=name, time_taken=time_taken)
ConfusionMatrix(gbm_tuned1, y_test, set1="test", model_name = name)
# Extreme Gradient Boosting Classifier alias XGBoost, default parameters.
# NOTE(review): eval_metric is passed at fit time here; in xgboost >= 2 it
# moved to the constructor — confirm against the installed version.
xgb_default = XGBClassifier(random_state=1)
xgb_default.fit(X_train,y_train,eval_metric="logloss")
#Using above defined function to get accuracy, recall and precision on train and test set
name = "XGBoost Model Default"
GetScores(xgb_default, model_name=name)
ConfusionMatrix(xgb_default,y_test, model_name=name, set1="test")
Note: the PCA pipeline used for the other models could not be applied to XGBoost here — most likely because fit parameters such as eval_metric are passed directly to fit(), which the pipeline does not forward without a step-name prefix — so the XGBoost models are tuned without a pipeline.
# Choose the type of classifier.
# NOTE(review): use_label_encoder was deprecated and later removed in
# xgboost 2.x — confirm against the installed xgboost version.
xgb_tuned = XGBClassifier(random_state=1,use_label_encoder=False)
# Grid of parameters to choose from
## add from
# 3*2*4*2*3*3*3 = 1296 combinations x 5 folds — this search is expensive,
# and (unlike the others) runs without n_jobs, i.e. single process.
parameters = {
"n_estimators": [50,100,150],
"scale_pos_weight":[2,5],
"subsample":[0.5,0.7,0.9,1],
"learning_rate":[0.2,0.05],
"gamma":[0,1,3],
"colsample_bytree":[0.7,0.9,1],
"colsample_bylevel":[0.7,0.9,1]
}
# Type of scoring used to compare parameter combinations (recall)
acc_scorer = metrics.make_scorer(metrics.recall_score)
# Start the time
start_time = time.time()
# Run the grid search
grid_obj = GridSearchCV(xgb_tuned, parameters,scoring=acc_scorer,cv=5)
# eval_metric is forwarded to each underlying XGBClassifier.fit call
grid_obj = grid_obj.fit(X_val, y_val, eval_metric="logloss")
# Evaluate time taken
time_taken = time.time()-start_time
print("Execution time: ",time_taken)
# Set the clf to the best combination of parameters
xgb_tuned = grid_obj.best_estimator_
# Fit the best algorithm to the data.
xgb_tuned.fit(X_train, y_train, eval_metric="logloss")
#Call get_score and ConfusionMatrix functions to evaluate the said model
name = "XGBoost Classifier Tuned GridSearchCV"
GetScores(xgb_tuned, model_name=name, time_taken=time_taken)
ConfusionMatrix(xgb_tuned, y_test, set1="test", model_name = name)
# Model object. Despite the pipeline_xgb1 name this is a bare classifier —
# no PCA pipeline is used for the XGBoost models.
# NOTE(review): use_label_encoder was deprecated and later removed in
# xgboost 2.x — confirm against the installed xgboost version.
pipeline_xgb1 = XGBClassifier(random_state=1, use_label_encoder=False)
# Distributions / lists of parameters to sample from
parameters_xgb1 = {
"n_estimators": sp_randint(10,100),
"subsample": [0.5,0.7,0.9,1],
"learning_rate":[0.01,0.1,0.2,0.05],
"gamma":sp_randint(0,3),
"colsample_bytree": [0.5,0.7,0.9,1],
"colsample_bylevel":[0.5,0.7,0.9,1]
}
# Parameters are tuned to the best performance based on this metric (recall)
acc_scorer = metrics.make_scorer(metrics.recall_score)
# Define the parameters on the grid
# Start the time
start_time = time.time()
# Randomized search (default 10 sampled candidates), single process
grid_obj_xgb1 = RandomizedSearchCV(pipeline_xgb1, parameters_xgb1,scoring=acc_scorer, cv=5)
grid_obj_xgb1 = grid_obj_xgb1.fit(X_val, y_val, eval_metric="logloss")
# Evaluate time taken
time_taken = time.time()-start_time
print("Execution time: ",time_taken)
# Set the clf to the best combination of parameters
xgb_tuned1 = grid_obj_xgb1.best_estimator_
# Fit algorithm on the best parameters
xgb_tuned1.fit(X_train, y_train, eval_metric="logloss")
#Call get_score and ConfusionMatrix functions to evaluate the said model
name = "XGBoost Classifier Tuned RandomSearchCV"
GetScores(xgb_tuned1, model_name=name, time_taken=time_taken)
ConfusionMatrix(xgb_tuned1, y_test, set1="test", model_name = name)
pd.DataFrame(model_results).round(4)#.to_csv("resultsSUBMISSSION.csv", index=False)
Comments: